/* This dofile is to process and merge stream choice data with existing grade 10 database */

// open stream choice dataset 
import excel using "$raw/student/Class XI Students.xlsx", clear firstrow

g indexnumber = _n

rename *, lower

ren student_code studentid 
rename orgname schoolname 

ren cidno citizenid 

replace citizenid = "" if length(citizenid)~=11
replace citizenid = "" if citizenid == "UNDER PROCE"
destring citizenid, replace

tostring citizenid, replace format("%20.0f")

replace schoolname = trim(itrim(lower(schoolname)))
tab schoolname

replace schoolname = "babesa higher secondary school" if schoolname == "babesa"
replace schoolname = "bajo higher secondary school" if schoolname == "bajo"
replace schoolname = "dashiding higher secondary school" if schoolname == "dashiding"
replace schoolname = "dechencholing" if schoolname == "dechencholing"
replace schoolname = "druk school" if schoolname == "druk"
replace schoolname = "gedu higher secondary school" if schoolname == "gedu"
replace schoolname = "karmaling higher secondary school" if schoolname == "karmaling"
replace schoolname = "karmaling higher secondary school" if schoolname == "lamgong"
replace schoolname = "lhuentse higher secondary school" if schoolname == "lhuentse"
replace schoolname = "mongar higher secondary school" if schoolname == "mongar"
replace schoolname = "motithang higher secondary school" if schoolname == "motithang"
replace schoolname = "pelkhil higher secondary school" if schoolname == "pelkhil"      
replace schoolname = "samdrupjongkhar" if schoolname == "samdrupjongkhar"      
replace schoolname = "samtse higher secondary school" if schoolname == "samtse"      
replace schoolname = "tashitse higher secondary school" if schoolname == "tashitse"      
replace schoolname = "yangchen gatshel middle secondary school" if schoolname == "yangchen gatshel"      
replace schoolname = "yangchenphug higher secondary school" if schoolname == "yangchenphug"      

replace name = trim(itrim(lower(name)))
replace name = subinstr(name," ","",.)  
replace name = subinstr(name,",","",.)   //Removes comma (,)
replace name = subinstr(name,"'","",.)   //Removes apostrophe (')
replace name = subinstr(name,".","",.)   //Removes dot (.) 
replace name = subinstr(name,"/","",.)   //Removes slash (/)
replace name = subinstr(name,"-","",.)   //Removes dash (-)
replace name = subinstr(name,"=","",.)   //Removes dash (-)
replace name = subinstr(name,"(","",.)   //Removes opening parentheses
replace name = subinstr(name,")","",.)   //Removes closing parentheses

g b_month = month(dateofbirth)
g b_day = day(dateofbirth)
g b_year = year(dateofbirth)

g sex = 1 if gender == "Male"
replace sex = 2 if gender == "Female"

drop dateofbirth 

g double dateofbirth = b_year*10^4 + b_month*10^2 + b_day
format dateofbirth %20.0f 
tostring dateofbirth, replace 

* keep relevant information for now 
keep indexnumber name studentid citizenid b_* dateofbirth sex stream schoolname
foreach variable in name studentid citizenid sex b_month b_day b_year schoolname dateofbirth {
    rename `variable' `variable'_a
}

/* manual comparison with the survey data on student id, citizenid, date of bith, 
   the stream data switch four observations */ 
replace name_a = "sonamwangchuk" if studentid_a == "201.00130.11.0215"
replace name_a = "sangayphuntsho" if studentid_a == "201.00130.11.0216" 
replace name_a = "sonamyangchen" if studentid_a == "201.00300.11.0055"
replace name_a = "tashitobgay" if studentid_a == "201.00312.11.0738"

save "$temp/streamchoice.dta", replace 


********************************************************************************
*
*			MERGE WITH GRADE 10 DATA 
*
********************************************************************************

// STEP 1: PERFECT MATCHING ON STUDENT ID + DOUBLE CHECK NAME 
********************************************************************************
gl criteria studentid  

* open the stream choice data 
use "$temp/streamchoice.dta", clear 
rename studentid_a studentid 

duplicates tag $criteria, g(temp)
tab temp

* obtain the subset of unique citizenid
preserve 
keep if temp==0
drop temp
save $temp/stream_step1u, replace 
restore 

* keep the duplicated for next round match
keep if temp~=0
drop temp
cap rename studentid studentid_a
save $temp/stream_step1d, replace


* open the survey data 
use "$temp/stem_analysis.dta", clear
replace name = subinstr(name," ","",.)  

format studentid_bl %20s

generate str citizenid_st = citizenid
replace citizenid = ""
compress citizenid
replace citizenid = citizenid_st
drop citizenid_st
describe citizenid

save "$temp/data10.dta", replace 


use "$temp/data10.dta", clear
rename studentid_bl studentid 

duplicates tag $criteria, g(temp)

* obtain the subset of unique citizenid
preserve 
keep if temp==0
drop temp
save $temp/data10_step1u, replace 
restore 

* keep the duplicated for next round match
keep if temp~=0
drop temp
rename studentid studentid_bl 
save $temp/data10_step1d, replace


* merging two unique data together using fully matched student ID & check fuzzy name 
use $temp/stream_step1u, clear
merge 1:1 $criteria using $temp/data10_step1u
matchit name_a name, g(namescore)
matchit dateofbirth_a dateofbirth, g(bscore)
sum namescore if _merge==3

* correct if names are not matched, leave for the next round of matching
replace _merge = . if (namescore <= 0.5 & bscore < 0.8) & _merge == 3

* keep matched data 
preserve 
keep if _merge==3 
keep indexnumber uniqueid 
save $temp/stream_step1m, replace 
restore 

* keep unmatched data from score database, merge with the duplicated from previous step
preserve 
keep if _merge==1 | _merge==.
keep indexnumber 
merge 1:1 indexnumber using $temp/streamchoice.dta, keep(matched) nogen 
append using $temp/stream_step1d
save $temp/stream_step2b, replace 
restore 

preserve 
keep if _merge==2 | _merge==.
keep responseid_bl schoolname_bl 
merge 1:1 responseid_bl schoolname_bl using $temp/data10.dta, keep(matched) nogen 
append using $temp/data10_step1d
save $temp/data10_step2b, replace 
restore 


// STEP 2: PERFECT MATCHING ON CITIZEN ID + DOUBLE CHECK NAME 
********************************************************************************
gl criteria citizenid 

* open the stream choice data 
use $temp/stream_step2b.dta, clear 
rename citizenid_a citizenid 

duplicates tag $criteria, g(temp)

* obtain the subset of unique citizenid
preserve 
keep if temp==0
drop temp
save $temp/stream_step2u, replace 
restore 

* keep the duplicated for next round match
keep if temp~=0
drop temp
cap rename citizenid citizenid_a
save $temp/stream_step2d, replace


* open the survey data 
use $temp/data10_step2b.dta, clear

duplicates tag $criteria, g(temp)

* obtain the subset of unique citizenid
preserve 
keep if temp==0
drop temp
save $temp/data10_step2u, replace 
restore 

* keep the duplicated for next round match
keep if temp~=0
drop temp
save $temp/data10_step2d, replace


* merging two unique data together using fully matched citizenship ID & check fuzzy name 
use $temp/stream_step2u
merge 1:1 $criteria using $temp/data10_step2u
matchit name_a name, g(namescore)
matchit dateofbirth_a dateofbirth, g(bscore)

sum namescore if _merge==3

replace _merge = . if namescore < 0.7 & bscore < 0.9 & _merge == 3

* keep matched data 
preserve 
keep if _merge==3 
keep indexnumber uniqueid 
save $temp/stream_step2m, replace 
restore 

* keep unmatched data from score database, merge with the duplicated from previous step
preserve 
keep if _merge==1 | _merge==.
keep indexnumber 
merge 1:1 indexnumber using $temp/streamchoice.dta, keep(matched) nogen 
append using $temp/stream_step2d
save $temp/stream_step3b, replace 
restore 

preserve 
keep if _merge==2 | _merge==.
keep responseid_bl schoolname_bl 
merge 1:1 responseid_bl schoolname_bl using $temp/data10.dta, keep(matched) nogen 
append using $temp/data10_step2d
save $temp/data10_step3b, replace 
restore  





// STEP 3: PERFECT MATCHING ON SEX + DATE OF BIRTH
********************************************************************************
gl criteria sex dateofbirth    

* open the stream choice data 
use $temp/stream_step3b.dta, clear 

rename dateofbirth_a dateofbirth 
rename sex_a sex

duplicates tag $criteria, g(temp)
tab temp

* obtain the subset of unique citizenid
preserve 
keep if temp==0
drop temp
save $temp/stream_step3u, replace 
restore 

* keep the duplicated for next round match
keep if temp~=0
drop temp
cap ren dateofbirth dateofbirth_a
cap ren sex sex_a
save $temp/stream_step3d, replace


* open the survey data 
use $temp/data10_step3b.dta, clear

duplicates tag $criteria, g(temp)

* obtain the subset of unique citizenid
preserve 
keep if temp==0
drop temp
save $temp/data10_step3u, replace 
restore 

* keep the duplicated for next round match
keep if temp~=0
drop temp
save $temp/data10_step3d, replace


* merging two unique data together 
use $temp/stream_step3u, clear
merge 1:1 $criteria using $temp/data10_step3u
matchit name_a name, g(namescore)
matchit studentid_a studentid_bl, g(idscore)
matchit citizenid_a citizenid, g(cidscore)

br name name_a *score if  _merge==3
sort name name_a

* correct if names are not matched, leave for the next round of matching
replace _merge = . if (namescore <= 0.6 | idscore < 0.8  & ~missing(studentid_bl) & ~missing(studentid_a)) & _merge == 3
replace _merge = . if studentid_a == "201.00269.11.0064" & studentid_bl == "201.00030.11.0210"

* keep matched data 
preserve 
keep if _merge==3 
keep indexnumber uniqueid 
save $temp/stream_step3m, replace 
restore 

* keep unmatched data from score database, merge with the duplicated from previous step
preserve 
keep if _merge==1 | _merge==.
keep indexnumber 
merge 1:1 indexnumber using $temp/streamchoice.dta, keep(matched) nogen 
append using $temp/stream_step3d
save $temp/stream_step4b, replace 
restore 

preserve 
keep if _merge==2 | _merge==.
keep responseid_bl schoolname_bl 
merge 1:1 responseid_bl schoolname_bl using $temp/data10.dta, keep(matched) nogen 
append using $temp/data10_step3d
save $temp/data10_step4b, replace 
restore  

// combine data together 
clear 
append using $temp/stream_step1m
g stream_match_criteria = "perfect(studentID) + fuzzy(name/dob)"

append using $temp/stream_step2m
replace stream_match_criteria = "perfect(CID) + fuzzy(name/dob)" if missing(stream_match_criteria)

append using $temp/stream_step3m
replace stream_match_criteria = "perfect(dob/sex) + fuzzy(studentID|CID|name)" if missing(stream_match_criteria)

merge 1:1 indexnumber using "$temp/streamchoice.dta", keep(matched) keepusing(stream) nogen
g s_stream_data = 1 
lab var s_stream_data "1/0 stream data available"

g science_stream = 0
replace science_stream = 1 if stream=="Science"
lab var science_stream "1/0 indicator for being enrolled in Science stream after lower secondary school (administrative data)"

g arts_stream = 0
replace arts_stream = 1 if stream=="Arts"
lab var arts_stream "1/0 indicator for being enrolled in Arts stream after lower secondary school (administrative data)"

g commerce_stream = 0
replace commerce_stream = 1 if stream=="Commerce"
lab var commerce_stream "1/0 indicator for being enrolled in Commerce stream after lower secondary school (administrative data)"

g rigzhung_stream = 0
replace rigzhung_stream = 1 if stream=="Rigzhung"
lab var rigzhung_stream "1/0 indicator for being enrolled in Rigzhung stream after lower secondary school (administrative data)"

save "$temp/grade10_analysis_gradeadm.dta", replace 





